source('utils.R')

#devtools::install_github("ujjwalkarn/xda")

library(knitr)

library(ggjoy)

#devtools::install_github("vsimko/corrplot")
library(corrplot)
# First run feature-engineering chunk from preprocessing.Rmd
# Build the analysis data frame in one pipeline:
#   1. load the wine data (load_data() is defined outside this file),
#   2. pass it through introduce_nas() for the 'pH' column -- the summary
#      printed further down shows pH with ~22.5% missing values, so 22.5 is
#      presumably a percentage (TODO confirm against the helper),
#   3. derive `other sulfur dioxide` as total minus free sulfur dioxide and
#      drop the now-redundant `total sulfur dioxide` column.
df <- load_data() %>%
  introduce_nas(22.5, 'pH') %>%
  mutate(`other sulfur dioxide` = `total sulfur dioxide` - `free sulfur dioxide`) %>%
  select(-`total sulfur dioxide`)

Data dictionary

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names

Input variables (based on physicochemical tests):

1. fixed acidity
2. volatile acidity
3. citric acid
4. residual sugar
5. chlorides
6. free sulfur dioxide
7. total sulfur dioxide -> Removed
8. other sulfur dioxide -> Created from ‘total sulfur dioxide’ - ‘free sulfur dioxide’
9. density
10. pH
11. sulphates
12. alcohol
13. wine_colour

Output variable (based on sensory data):

14. quality (score between 0 and 10)

  1. Relevant Information:

The two datasets are related to red and white variants of the Portuguese “Vinho Verde” wine. For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).

These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are much more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.

  2. Number of Instances: red wine - 1599; white wine - 4898.

  3. Number of Attributes: 12 + output attribute

Note: several of the attributes may be correlated, thus it makes sense to apply some sort of feature selection.

# Per-column summary statistics for the numeric variables (xda package);
# the printed table follows.
xda::numSummary(df)
##                         n   mean     sd     max   min    range nunique
## fixed acidity        6497  7.215  1.296  15.900 3.800  12.1000     106
## volatile acidity     6497  0.340  0.165   1.580 0.080   1.5000     187
## citric acid          6497  0.319  0.145   1.660 0.000   1.6600      89
## residual sugar       6497  5.443  4.758  65.800 0.600  65.2000     316
## chlorides            6497  0.056  0.035   0.611 0.009   0.6020     214
## free sulfur dioxide  6497 30.525 17.749 289.000 1.000 288.0000     135
## density              6497  0.995  0.003   1.039 0.987   0.0519     998
## pH                   5036  3.220  0.161   4.010 2.720   1.2900     108
## sulphates            6497  0.531  0.149   2.000 0.220   1.7800     111
## alcohol              6497 10.492  1.193  14.900 8.000   6.9000     111
## quality              6497  5.818  0.873   9.000 3.000   6.0000       7
## other sulfur dioxide 6495 85.237 45.418 331.000 3.000 328.0000     251
##                      nzeros      iqr lowerbound upperbound noutlier
## fixed acidity             0  1.30000     4.4500      9.650      357
## volatile acidity          0  0.17000    -0.0250      0.655      377
## citric acid             151  0.14000     0.0400      0.600      509
## residual sugar            0  6.30000    -7.6500     17.550      118
## chlorides                 0  0.02700    -0.0025      0.106      286
## free sulfur dioxide       0 24.00000   -19.0000     77.000       62
## density                   0  0.00465     0.9854      1.004        3
## pH                        0  0.21000     2.7950      3.635       58
## sulphates                 0  0.17000     0.1750      0.855      191
## alcohol                   0  1.80000     6.8000     14.000        3
## quality                   0  1.00000     3.5000      7.500      228
## other sulfur dioxide      0 61.00000   -36.5000    207.500       20
##                      kurtosis skewness    mode miss   miss%    1%     5%
## fixed acidity           5.054    1.722   6.800    0  0.0000 5.100  5.700
## volatile acidity        2.820    1.494   0.280    0  0.0000 0.120  0.160
## citric acid             2.393    0.472   0.300    0  0.0000 0.000  0.050
## residual sugar          4.353    1.435   2.000    0  0.0000 0.900  1.200
## chlorides              50.841    5.397   0.044    0  0.0000 0.021  0.028
## free sulfur dioxide     7.896    1.220  29.000    0  0.0000 4.000  6.000
## density                 6.597    0.503   0.997    0  0.0000 0.989  0.990
## pH                      0.348    0.378      NA 1461 22.4873 2.890  2.970
## sulphates               8.643    1.796   0.500    0  0.0000 0.300  0.350
## alcohol                -0.533    0.565   9.500    0  0.0000 8.700  9.000
## quality                 0.230    0.190   6.000    0  0.0000 4.000  5.000
## other sulfur dioxide   -0.322    0.101 101.000    2  0.0308 6.000 10.000
##                         25%    50%     75%     95%     99%
## fixed acidity         6.400  7.000   7.700   9.800  12.000
## volatile acidity      0.230  0.290   0.400   0.670   0.880
## citric acid           0.250  0.310   0.390   0.560   0.740
## residual sugar        1.800  3.000   8.100  15.000  18.200
## chlorides             0.038  0.047   0.065   0.102   0.186
## free sulfur dioxide  17.000 29.000  41.000  61.000  77.000
## density               0.992  0.995   0.997   0.999   1.001
## pH                    3.110  3.210   3.320   3.500   3.640
## sulphates             0.430  0.510   0.600   0.790   0.990
## alcohol               9.500 10.300  11.300  12.700  13.400
## quality               5.000  6.000   6.000   7.000   8.000
## other sulfur dioxide 55.000 86.000 116.000 159.000 189.000
# Summary of the non-numeric columns (xda package); per the output below,
# wine_colour is the only such column.
xda::charSummary(df)
##                n miss miss% unique     top5levels:count
## wine_colour 6497    0     0      2 white:4898, red:1599
# plot_histograms() is not defined in this file (presumably utils.R -- TODO
# confirm). The stat_bin warnings recorded below report rows dropped for
# non-finite values.
plot_histograms(df)

## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing non-finite values (stat_bin).

## Warning: Removed 1461 rows containing non-finite values (stat_bin).

# plot_boxplots() is not defined in this file (presumably utils.R -- TODO
# confirm). The stat_boxplot warnings recorded below report rows dropped for
# non-finite values.
plot_boxplots(df)

## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).

# plot_joyplots() is not defined in this file (presumably utils.R -- TODO
# confirm). The messages recorded below show one "joint bandwidth" pick per
# plot, plus stat_density_ridges warnings for rows with non-finite values.
plot_joyplots(df)
## Picking joint bandwidth of 0.223

## Picking joint bandwidth of 0.0252

## Picking joint bandwidth of 0.0274

## Picking joint bandwidth of 0.471

## Picking joint bandwidth of 0.0024

## Picking joint bandwidth of 2.47

## Picking joint bandwidth of 4.93
## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).

## Picking joint bandwidth of 0.000418

## Picking joint bandwidth of 0.0276
## Warning: Removed 1461 rows containing non-finite values
## (stat_density_ridges).

## Picking joint bandwidth of 0.0224

## Picking joint bandwidth of 0.211

## Picking joint bandwidth of 0.138

# other_plots() is not defined in this file (presumably utils.R -- TODO
# confirm).
# NOTE(review): the recorded output below contains eleven stray "## NULL"
# lines, which looks like the helper printing a NULL result once per plot --
# worth checking in the helper itself.
other_plots(df)

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).

# Boxplots for the 'red' wines; boxplots_quality_colour_wine() is not defined
# in this file (presumably utils.R -- TODO confirm how it uses quality).
boxplots_quality_colour_wine(df, 'red')

## Warning: Removed 2 rows containing non-finite values (stat_boxplot).

## Warning: Removed 363 rows containing non-finite values (stat_boxplot).

# Boxplots for the 'white' wines; boxplots_quality_colour_wine() is not
# defined in this file (presumably utils.R -- TODO confirm how it uses quality).
boxplots_quality_colour_wine(df, 'white')

## Warning: Removed 1098 rows containing non-finite values (stat_boxplot).

# Correlation matrix across both wine colours. Rows containing any NA are
# dropped first, and the non-numeric wine_colour column is removed so that
# cor() receives numeric columns only.
corrs <- df %>%
  drop_na() %>%
  select(-wine_colour) %>%
  cor()

corrs
##                      fixed acidity volatile acidity citric acid
## fixed acidity               1.0000           0.2277      0.3282
## volatile acidity            0.2277           1.0000     -0.3769
## citric acid                 0.3282          -0.3769      1.0000
## residual sugar             -0.1062          -0.1896      0.1506
## chlorides                   0.3158           0.4070      0.0181
## free sulfur dioxide        -0.2847          -0.3530      0.1284
## density                     0.4646           0.2828      0.1018
## pH                         -0.2583           0.2633     -0.3300
## sulphates                   0.2944           0.2276      0.0434
## alcohol                    -0.1064          -0.0441     -0.0105
## quality                    -0.0855          -0.2668      0.0890
## other sulfur dioxide       -0.3039          -0.3780      0.1847
##                      residual sugar chlorides free sulfur dioxide density
## fixed acidity               -0.1062    0.3158             -0.2847  0.4646
## volatile acidity            -0.1896    0.4070             -0.3530  0.2828
## citric acid                  0.1506    0.0181              0.1284  0.1018
## residual sugar               1.0000   -0.1252              0.3948  0.5562
## chlorides                   -0.1252    1.0000             -0.2150  0.3825
## free sulfur dioxide          0.3948   -0.2150              1.0000  0.0205
## density                      0.5562    0.3825              0.0205  1.0000
## pH                          -0.2642    0.0608             -0.1316  0.0104
## sulphates                   -0.1813    0.3706             -0.1847  0.2573
## alcohol                     -0.3571   -0.2649             -0.1695 -0.6857
## quality                     -0.0382   -0.2139              0.0604 -0.3112
## other sulfur dioxide         0.4544   -0.2845              0.4964  0.0273
##                           pH sulphates   alcohol quality
## fixed acidity        -0.2583  0.294395 -0.106399 -0.0855
## volatile acidity      0.2633  0.227578 -0.044143 -0.2668
## citric acid          -0.3300  0.043369 -0.010538  0.0890
## residual sugar       -0.2642 -0.181263 -0.357098 -0.0382
## chlorides             0.0608  0.370602 -0.264854 -0.2139
## free sulfur dioxide  -0.1316 -0.184732 -0.169486  0.0604
## density               0.0104  0.257265 -0.685664 -0.3112
## pH                    1.0000  0.201430  0.119000  0.0233
## sulphates             0.2014  1.000000 -0.000449  0.0397
## alcohol               0.1190 -0.000449  1.000000  0.4519
## quality               0.0233  0.039660  0.451941  1.0000
## other sulfur dioxide -0.2296 -0.260291 -0.253983 -0.0703
##                      other sulfur dioxide
## fixed acidity                     -0.3039
## volatile acidity                  -0.3780
## citric acid                        0.1847
## residual sugar                     0.4544
## chlorides                         -0.2845
## free sulfur dioxide                0.4964
## density                            0.0273
## pH                                -0.2296
## sulphates                         -0.2603
## alcohol                           -0.2540
## quality                           -0.0703
## other sulfur dioxide               1.0000
# Mixed correlogram for all wines: numeric coefficients in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(corrs, lower = "number", upper = "ellipse")

# Correlation matrix restricted to red wines: drop rows containing any NA,
# keep only the 'red' rows, then remove wine_colour before calling cor().
corrs_red <- df %>%
  drop_na() %>%
  filter(wine_colour == 'red') %>%
  select(-wine_colour) %>%
  cor()

corrs_red
##                      fixed acidity volatile acidity citric acid
## fixed acidity               1.0000         -0.25350      0.6785
## volatile acidity           -0.2535          1.00000     -0.5502
## citric acid                 0.6785         -0.55016      1.0000
## residual sugar              0.1307          0.00317      0.1428
## chlorides                   0.0917          0.08131      0.1671
## free sulfur dioxide        -0.1421         -0.01381     -0.0668
## density                     0.6607          0.03383      0.3536
## pH                         -0.6923          0.24452     -0.5520
## sulphates                   0.1759         -0.26871      0.2844
## alcohol                    -0.0679         -0.20490      0.1157
## quality                     0.1322         -0.39549      0.2449
## other sulfur dioxide       -0.0838          0.10177      0.0543
##                      residual sugar chlorides free sulfur dioxide  density
## fixed acidity               0.13072    0.0917            -0.14215  0.66071
## volatile acidity            0.00317    0.0813            -0.01381  0.03383
## citric acid                 0.14279    0.1671            -0.06681  0.35360
## residual sugar              1.00000    0.0766             0.22751  0.36876
## chlorides                   0.07662    1.0000            -0.01434  0.19991
## free sulfur dioxide         0.22751   -0.0143             1.00000  0.00992
## density                     0.36876    0.1999             0.00992  1.00000
## pH                         -0.09060   -0.2295             0.07305 -0.33358
## sulphates                   0.01888    0.2948             0.06838  0.14063
## alcohol                     0.03307   -0.2187            -0.07244 -0.51141
## quality                     0.02677   -0.1248            -0.03584 -0.17299
## other sulfur dioxide        0.17310    0.0557             0.43171  0.09298
##                           pH sulphates alcohol quality
## fixed acidity        -0.6923    0.1759 -0.0679  0.1322
## volatile acidity      0.2445   -0.2687 -0.2049 -0.3955
## citric acid          -0.5520    0.2844  0.1157  0.2449
## residual sugar       -0.0906    0.0189  0.0331  0.0268
## chlorides            -0.2295    0.2948 -0.2187 -0.1248
## free sulfur dioxide   0.0730    0.0684 -0.0724 -0.0358
## density              -0.3336    0.1406 -0.5114 -0.1730
## pH                    1.0000   -0.1736  0.1903 -0.0778
## sulphates            -0.1736    1.0000  0.1028  0.2696
## alcohol               0.1903    0.1028  1.0000  0.4768
## quality              -0.0778    0.2696  0.4768  1.0000
## other sulfur dioxide -0.1148    0.0367 -0.2302 -0.1925
##                      other sulfur dioxide
## fixed acidity                     -0.0838
## volatile acidity                   0.1018
## citric acid                        0.0543
## residual sugar                     0.1731
## chlorides                          0.0557
## free sulfur dioxide                0.4317
## density                            0.0930
## pH                                -0.1148
## sulphates                          0.0367
## alcohol                           -0.2302
## quality                           -0.1925
## other sulfur dioxide               1.0000
# Mixed correlogram for red wines: numeric coefficients in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(corrs_red, lower = "number", upper = "ellipse")

# Correlation matrix restricted to white wines: drop rows containing any NA,
# keep only the 'white' rows, then remove wine_colour before calling cor().
corrs_white <- df %>%
  drop_na() %>%
  filter(wine_colour == 'white') %>%
  select(-wine_colour) %>%
  cor()

corrs_white
##                      fixed acidity volatile acidity citric acid
## fixed acidity               1.0000          -0.0285      0.2989
## volatile acidity           -0.0285           1.0000     -0.1543
## citric acid                 0.2989          -0.1543      1.0000
## residual sugar              0.0954           0.0747      0.1090
## chlorides                   0.0338           0.0757      0.1198
## free sulfur dioxide        -0.0579          -0.0985      0.0909
## density                     0.2753           0.0355      0.1636
## pH                         -0.4338          -0.0250     -0.1652
## sulphates                  -0.0245          -0.0349      0.0576
## alcohol                    -0.1320           0.0652     -0.0797
## quality                    -0.1241          -0.1832     -0.0143
## other sulfur dioxide        0.1286           0.1583      0.0987
##                      residual sugar chlorides free sulfur dioxide density
## fixed acidity                0.0954    0.0338            -0.05788  0.2753
## volatile acidity             0.0747    0.0757            -0.09849  0.0355
## citric acid                  0.1090    0.1198             0.09090  0.1636
## residual sugar               1.0000    0.1027             0.29086  0.8406
## chlorides                    0.1027    1.0000             0.08545  0.2698
## free sulfur dioxide          0.2909    0.0854             1.00000  0.2819
## density                      0.8406    0.2698             0.28193  1.0000
## pH                          -0.1938   -0.0983             0.01172 -0.0959
## sulphates                   -0.0267    0.0128             0.05330  0.0710
## alcohol                     -0.4484   -0.3663            -0.23854 -0.7744
## quality                     -0.1030   -0.2215             0.00725 -0.3100
## other sulfur dioxide         0.3442    0.1976             0.25333  0.4980
##                           pH sulphates alcohol  quality
## fixed acidity        -0.4338   -0.0245 -0.1320 -0.12405
## volatile acidity     -0.0250   -0.0349  0.0652 -0.18318
## citric acid          -0.1652    0.0576 -0.0797 -0.01428
## residual sugar       -0.1938   -0.0267 -0.4484 -0.10295
## chlorides            -0.0983    0.0128 -0.3663 -0.22152
## free sulfur dioxide   0.0117    0.0533 -0.2385  0.00725
## density              -0.0959    0.0710 -0.7744 -0.30998
## pH                    1.0000    0.1641  0.1251  0.11260
## sulphates             0.1641    1.0000 -0.0109  0.05610
## alcohol               0.1251   -0.0109  1.0000  0.44445
## quality               0.1126    0.0561  0.4444  1.00000
## other sulfur dioxide  0.0099    0.1414 -0.4182 -0.22366
##                      other sulfur dioxide
## fixed acidity                      0.1286
## volatile acidity                   0.1583
## citric acid                        0.0987
## residual sugar                     0.3442
## chlorides                          0.1976
## free sulfur dioxide                0.2533
## density                            0.4980
## pH                                 0.0099
## sulphates                          0.1414
## alcohol                           -0.4182
## quality                           -0.2237
## other sulfur dioxide               1.0000
# Mixed correlogram for white wines: numeric coefficients in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(corrs_white, lower = "number", upper = "ellipse")